# Location of the concrete compressive-strength dataset (CSV).
# The file was converted from xls to csv beforehand with a bash script (the
# original note cites poor OSX support for the file format).
# Set the CONCRETE_DATA_PATH environment variable to read a copy stored
# elsewhere; when it is unset the original hard-coded location is used.
data_path <- Sys.getenv(
  "CONCRETE_DATA_PATH",
  unset = "/Users/apple/Downloads/Concrete_Data.csv"
)
library(data.table)
library(ggplot2)
library(stats)
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
## http://homerhanumat.github.io/tigerstats
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library(dplyr)
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:mosaic':
##
## dotPlot
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following objects are masked from 'package:mosaic':
##
## deltaMethod, logit
## The following object is masked from 'package:dplyr':
##
## recode
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(readxl)
library(ggpubr)
library(BBmisc)
##
## Attaching package: 'BBmisc'
## The following objects are masked from 'package:dplyr':
##
## coalesce, collapse
## The following object is masked from 'package:grid':
##
## explode
## The following object is masked from 'package:nlme':
##
## collapse
## The following object is masked from 'package:base':
##
## isFALSE
#
# 1. Read the dataset into the R environment
#
# check.names = TRUE rewrites the CSV headers into syntactically valid R
# names (spaces and brackets become dots).
data <- fread(data_path, check.names = TRUE)
# Preview the first rows.
head(data)
## Cement..component.1..kg.in.a.m.3.mixture.
## 1: 540.0
## 2: 540.0
## 3: 332.5
## 4: 332.5
## 5: 198.6
## 6: 266.0
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.
## 1: 0.0
## 2: 0.0
## 3: 142.5
## 4: 142.5
## 5: 132.4
## 6: 114.0
## Fly.Ash..component.3..kg.in.a.m.3.mixture.
## 1: 0.0
## 2: 0.0
## 3: 0.0
## 4: 0.0
## 5: 0.0
## 6: 0.0
## Water...component.4..kg.in.a.m.3.mixture.
## 1: 162.0
## 2: 162.0
## 3: 228.0
## 4: 228.0
## 5: 192.0
## 6: 228.0
## Superplasticizer..component.5..kg.in.a.m.3.mixture.
## 1: 2.5
## 2: 2.5
## 3: 0.0
## 4: 0.0
## 5: 0.0
## 6: 0.0
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.
## 1: 1040.0
## 2: 1055.0
## 3: 932.0
## 4: 932.0
## 5: 978.4
## 6: 932.0
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. Age..day.
## 1: 676.0 28
## 2: 676.0 28
## 3: 594.0 270
## 4: 594.0 365
## 5: 825.5 360
## 6: 670.0 90
## Concrete.compressive.strength.MPa..megapascals..
## 1: 79.99
## 2: 61.89
## 3: 40.27
## 4: 41.05
## 5: 44.30
## 6: 47.03
#
# 2. Descriptive Analysis
#
# Structure of the data
# Every column is read as character (the values carry trailing whitespace),
# so a numeric conversion is needed before any analysis.
str(data)
## Classes 'data.table' and 'data.frame': 1030 obs. of 9 variables:
## $ Cement..component.1..kg.in.a.m.3.mixture. : chr "540.0 " "540.0 " "332.5 " "332.5 " ...
## $ Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.: chr "0.0 " "0.0 " "142.5 " "142.5 " ...
## $ Fly.Ash..component.3..kg.in.a.m.3.mixture. : chr "0.0 " "0.0 " "0.0 " "0.0 " ...
## $ Water...component.4..kg.in.a.m.3.mixture. : chr "162.0 " "162.0 " "228.0 " "228.0 " ...
## $ Superplasticizer..component.5..kg.in.a.m.3.mixture. : chr "2.5 " "2.5 " "0.0 " "0.0 " ...
## $ Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. : chr "1040.0 " "1055.0 " "932.0 " "932.0 " ...
## $ Fine.Aggregate..component.7..kg.in.a.m.3.mixture. : chr "676.0 " "676.0 " "594.0 " "594.0 " ...
## $ Age..day. : chr "28 " "28 " "270 " "365 " ...
## $ Concrete.compressive.strength.MPa..megapascals.. : chr "79.99 " "61.89 " "40.27 " "41.05 " ...
## - attr(*, ".internal.selfref")=<externalptr>
# Column names
# The last column is the regression target; every preceding column is a
# model input.
column_names <- names(data)
target_column <- column_names[length(column_names)]
# `1:length(x)-1` parses as `(1:length(x)) - 1`, i.e. 0:(n - 1), and only
# gives the intended result because index 0 is silently dropped.
# head(x, -1) states "everything except the last element" directly.
input_column <- head(column_names, -1)
column_names
## [1] "Cement..component.1..kg.in.a.m.3.mixture."
## [2] "Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture."
## [3] "Fly.Ash..component.3..kg.in.a.m.3.mixture."
## [4] "Water...component.4..kg.in.a.m.3.mixture."
## [5] "Superplasticizer..component.5..kg.in.a.m.3.mixture."
## [6] "Coarse.Aggregate...component.6..kg.in.a.m.3.mixture."
## [7] "Fine.Aggregate..component.7..kg.in.a.m.3.mixture."
## [8] "Age..day."
## [9] "Concrete.compressive.strength.MPa..megapascals.."
target_column
## [1] "Concrete.compressive.strength.MPa..megapascals.."
input_column
## [1] "Cement..component.1..kg.in.a.m.3.mixture."
## [2] "Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture."
## [3] "Fly.Ash..component.3..kg.in.a.m.3.mixture."
## [4] "Water...component.4..kg.in.a.m.3.mixture."
## [5] "Superplasticizer..component.5..kg.in.a.m.3.mixture."
## [6] "Coarse.Aggregate...component.6..kg.in.a.m.3.mixture."
## [7] "Fine.Aggregate..component.7..kg.in.a.m.3.mixture."
## [8] "Age..day."
# Dimension of data
dim(data)
## [1] 1030 9
# Data Preprocessing
# Convert character columns to numeric
# Thin wrapper around as.numeric(); values that cannot be parsed become NA
# (with a warning).
convertToNumeric <- function(X) {
  as.numeric(X)
}
# Run the conversion over every column (.SD stands for all columns here).
data <- data[, lapply(.SD, convertToNumeric)]
# Check for null values
# Per-column count of NA entries.
colSums(is.na(data)) # -> No null values
## Cement..component.1..kg.in.a.m.3.mixture.
## 0
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.
## 0
## Fly.Ash..component.3..kg.in.a.m.3.mixture.
## 0
## Water...component.4..kg.in.a.m.3.mixture.
## 0
## Superplasticizer..component.5..kg.in.a.m.3.mixture.
## 0
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.
## 0
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.
## 0
## Age..day.
## 0
## Concrete.compressive.strength.MPa..megapascals..
## 0
# Histogram for numerical data
# A 3 x 3 plotting grid holds one histogram per column (the dataset has nine
# columns). For each column the name and its summary() are printed as well.
par(mfrow=c(3,3))
for(name in names(data)) {
# Extract the column as a plain vector by its name.
X = data[[name]]
print(name)
print(summary(X))
# The column name is used as the plot title.
hist(X, main=name)
}
## [1] "Cement..component.1..kg.in.a.m.3.mixture."
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 102.0 192.4 272.9 281.2 350.0 540.0
## [1] "Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture."
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 22.00 73.89 142.95 359.40
## [1] "Fly.Ash..component.3..kg.in.a.m.3.mixture."
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 54.19 118.30 200.10
## [1] "Water...component.4..kg.in.a.m.3.mixture."
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 121.8 164.9 185.0 181.6 192.0 247.0
## [1] "Superplasticizer..component.5..kg.in.a.m.3.mixture."
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 6.300 6.203 10.200 32.200
## [1] "Coarse.Aggregate...component.6..kg.in.a.m.3.mixture."
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 801.0 932.0 968.0 972.9 1029.4 1145.0
## [1] "Fine.Aggregate..component.7..kg.in.a.m.3.mixture."
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 594.0 731.0 779.5 773.6 824.0 992.6
## [1] "Age..day."
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 28.00 45.66 56.00 365.00
## [1] "Concrete.compressive.strength.MPa..megapascals.."
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.33 23.71 34.45 35.82 46.13 82.60

par(mfrow=c(1,1))
# Density plot for target variable
# add = "mean" marks the mean on the plot; rug = TRUE draws a tick for each
# observation along the axis.
ggdensity(
  data,
  x = "Concrete.compressive.strength.MPa..megapascals..",
  fill = "#0073C2FF",
  color = "#0073C2FF",
  add = "mean",
  rug = TRUE
)
## Warning: geom_vline(): Ignoring `mapping` because `xintercept` was provided.
## Warning: geom_vline(): Ignoring `data` because `xintercept` was provided.

# Boxplot for input columns
# One boxplot per input column, titled with the column name.
for(name in input_column) {
boxplot(data[[name]], main=name)
}








# QQ Plot for input columns against target column
# For each input column, plot its quantiles against the quantiles of the
# target column; the x axis is labelled with the input column's name.
for(name in input_column) {
qqplot(data[[name]], data$Concrete.compressive.strength.MPa..megapascals.., xlab=name)
}








# Analysis on each column
# For every input column draw a density plot and a 30-bin histogram, each
# with a dashed vertical line at the column mean. Looping over input_column
# replaces seven copy-pasted stanzas and also covers
# Coarse.Aggregate...component.6..kg.in.a.m.3.mixture., which the
# hand-written version left out.
for (name in input_column) {
  # Compute the mean once, outside aes(), so the line position is a plain
  # constant rather than an expression evaluated per layer.
  column_mean <- mean(data[[name]])
  # .data[[name]] looks the column up by its string name; labs() keeps the
  # axis labelled with the column name.
  a <- ggplot(data, aes(x = .data[[name]])) +
    labs(x = name)
  # NOTE(review): `size` is kept for compatibility with the ggplot2 version
  # this script was written against; newer releases prefer `linewidth`.
  mean_line <- geom_vline(
    xintercept = column_mean,
    linetype = "dashed", size = 0.6
  )
  # ggplot objects are not auto-printed inside a loop, so print explicitly.
  print(a + geom_density() + mean_line)
  print(
    a +
      geom_histogram(bins = 30, color = "black", fill = "gray") +
      mean_line
  )
}

#
# 3. Perform required Diagnostic data analytics on the explored dataset.
#
# Correlation Coefficients
# Pairwise correlation matrix over all numeric columns.
cor(select_if(data, is.numeric))
## Cement..component.1..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture. 1.00000000
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. -0.27520026
## Fly.Ash..component.3..kg.in.a.m.3.mixture. -0.39747855
## Water...component.4..kg.in.a.m.3.mixture. -0.08150687
## Superplasticizer..component.5..kg.in.a.m.3.mixture. 0.09241390
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. -0.10936104
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. -0.22270327
## Age..day. 0.08194618
## Concrete.compressive.strength.MPa..megapascals.. 0.49782924
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture. -0.27520026
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. 1.00000000
## Fly.Ash..component.3..kg.in.a.m.3.mixture. -0.32358377
## Water...component.4..kg.in.a.m.3.mixture. 0.10734660
## Superplasticizer..component.5..kg.in.a.m.3.mixture. 0.04338346
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. -0.28400776
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. -0.28160129
## Age..day. -0.04424505
## Concrete.compressive.strength.MPa..megapascals.. 0.13482625
## Fly.Ash..component.3..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture. -0.397478547
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. -0.323583770
## Fly.Ash..component.3..kg.in.a.m.3.mixture. 1.000000000
## Water...component.4..kg.in.a.m.3.mixture. -0.257057836
## Superplasticizer..component.5..kg.in.a.m.3.mixture. 0.377399697
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. -0.009979403
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. 0.079094391
## Age..day. -0.154371716
## Concrete.compressive.strength.MPa..megapascals.. -0.105758502
## Water...component.4..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture. -0.08150687
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. 0.10734660
## Fly.Ash..component.3..kg.in.a.m.3.mixture. -0.25705784
## Water...component.4..kg.in.a.m.3.mixture. 1.00000000
## Superplasticizer..component.5..kg.in.a.m.3.mixture. -0.65746099
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. -0.18236084
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. -0.45069081
## Age..day. 0.27760928
## Concrete.compressive.strength.MPa..megapascals.. -0.28960079
## Superplasticizer..component.5..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture. 0.09241390
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. 0.04338346
## Fly.Ash..component.3..kg.in.a.m.3.mixture. 0.37739970
## Water...component.4..kg.in.a.m.3.mixture. -0.65746099
## Superplasticizer..component.5..kg.in.a.m.3.mixture. 1.00000000
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. -0.26608660
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. 0.22258833
## Age..day. -0.19268924
## Concrete.compressive.strength.MPa..megapascals.. 0.36602184
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture. -0.109361038
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. -0.284007756
## Fly.Ash..component.3..kg.in.a.m.3.mixture. -0.009979403
## Water...component.4..kg.in.a.m.3.mixture. -0.182360840
## Superplasticizer..component.5..kg.in.a.m.3.mixture. -0.266086598
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. 1.000000000
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. -0.178496507
## Age..day. -0.003015880
## Concrete.compressive.strength.MPa..megapascals.. -0.164934614
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.
## Cement..component.1..kg.in.a.m.3.mixture. -0.22270327
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. -0.28160129
## Fly.Ash..component.3..kg.in.a.m.3.mixture. 0.07909439
## Water...component.4..kg.in.a.m.3.mixture. -0.45069081
## Superplasticizer..component.5..kg.in.a.m.3.mixture. 0.22258833
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. -0.17849651
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. 1.00000000
## Age..day. -0.15609400
## Concrete.compressive.strength.MPa..megapascals.. -0.16723752
## Age..day.
## Cement..component.1..kg.in.a.m.3.mixture. 0.08194618
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. -0.04424505
## Fly.Ash..component.3..kg.in.a.m.3.mixture. -0.15437172
## Water...component.4..kg.in.a.m.3.mixture. 0.27760928
## Superplasticizer..component.5..kg.in.a.m.3.mixture. -0.19268924
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. -0.00301588
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. -0.15609400
## Age..day. 1.00000000
## Concrete.compressive.strength.MPa..megapascals.. 0.32887300
## Concrete.compressive.strength.MPa..megapascals..
## Cement..component.1..kg.in.a.m.3.mixture. 0.4978292
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. 0.1348262
## Fly.Ash..component.3..kg.in.a.m.3.mixture. -0.1057585
## Water...component.4..kg.in.a.m.3.mixture. -0.2896008
## Superplasticizer..component.5..kg.in.a.m.3.mixture. 0.3660218
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. -0.1649346
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. -0.1672375
## Age..day. 0.3288730
## Concrete.compressive.strength.MPa..megapascals.. 1.0000000
# Correlation Plot
# The column names are too long to display, so text labels are switched off
# (tl.pos = "n") and only the coefficients are drawn.
M <- cor(select_if(data, is.numeric))
corrplot(M, method = "number", tl.pos = "n")

# Statistical Analysis
favstats(~Cement..component.1..kg.in.a.m.3.mixture., data=data)
## min Q1 median Q3 max mean sd n missing
## 102 192.375 272.9 350 540 281.1664 104.5077 1030 0
favstats(~Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture., data=data)
## min Q1 median Q3 max mean sd n missing
## 0 0 22 142.95 359.4 73.89485 86.27934 1030 0
favstats(~Fly.Ash..component.3..kg.in.a.m.3.mixture., data=data)
## min Q1 median Q3 max mean sd n missing
## 0 0 0 118.3 200.1 54.18738 63.99596 1030 0
favstats(~Water...component.4..kg.in.a.m.3.mixture., data=data)
## min Q1 median Q3 max mean sd n missing
## 121.8 164.9 185 192 247 181.5649 21.35566 1030 0
favstats(~Superplasticizer..component.5..kg.in.a.m.3.mixture., data=data)
## min Q1 median Q3 max mean sd n missing
## 0 0 6.3 10.2 32.2 6.203204 5.973035 1030 0
favstats(~Coarse.Aggregate...component.6..kg.in.a.m.3.mixture., data=data)
## min Q1 median Q3 max mean sd n missing
## 801 932 968 1029.4 1145 972.9189 77.75395 1030 0
favstats(~Fine.Aggregate..component.7..kg.in.a.m.3.mixture., data=data)
## min Q1 median Q3 max mean sd n missing
## 594 730.95 779.5 824 992.6 773.5795 80.1758 1030 0
favstats(~Age..day., data=data)
## min Q1 median Q3 max mean sd n missing
## 1 7 28 56 365 45.66214 63.16991 1030 0
favstats(~Concrete.compressive.strength.MPa..megapascals.., data=data)
## min Q1 median Q3 max mean sd n missing
## 2.33 23.71 34.445 46.135 82.6 35.81796 16.70574 1030 0
summary(data)
## Cement..component.1..kg.in.a.m.3.mixture.
## Min. :102.0
## 1st Qu.:192.4
## Median :272.9
## Mean :281.2
## 3rd Qu.:350.0
## Max. :540.0
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 22.00
## Mean : 73.89
## 3rd Qu.:142.95
## Max. :359.40
## Fly.Ash..component.3..kg.in.a.m.3.mixture.
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 54.19
## 3rd Qu.:118.30
## Max. :200.10
## Water...component.4..kg.in.a.m.3.mixture.
## Min. :121.8
## 1st Qu.:164.9
## Median :185.0
## Mean :181.6
## 3rd Qu.:192.0
## Max. :247.0
## Superplasticizer..component.5..kg.in.a.m.3.mixture.
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 6.300
## Mean : 6.203
## 3rd Qu.:10.200
## Max. :32.200
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.
## Min. : 801.0
## 1st Qu.: 932.0
## Median : 968.0
## Mean : 972.9
## 3rd Qu.:1029.4
## Max. :1145.0
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. Age..day.
## Min. :594.0 Min. : 1.00
## 1st Qu.:731.0 1st Qu.: 7.00
## Median :779.5 Median : 28.00
## Mean :773.6 Mean : 45.66
## 3rd Qu.:824.0 3rd Qu.: 56.00
## Max. :992.6 Max. :365.00
## Concrete.compressive.strength.MPa..megapascals..
## Min. : 2.33
## 1st Qu.:23.71
## Median :34.45
## Mean :35.82
## 3rd Qu.:46.13
## Max. :82.60
# Normalise the data
# With method = "standardize" every column, the target included, is rescaled;
# the variance check further down reports 1 for each column.
# NOTE(review): this runs on the full dataset before the train/test split,
# so the test rows influence the scaling statistics, and all later error
# metrics are in standardized units rather than MPa -- confirm this is
# intended. The `range` argument presumably only matters for
# method = "range"; verify against the BBmisc documentation.
data = normalize(data, method = "standardize", range = c(0, 1), margin = 1L, on.constant = "quiet")
#
# 4. Check for the assumptions of Regression on the loaded dataset.
#
# Assumption 1 : Checking if linear regression is linear in parameters
# Assemble "target ~ x1+x2+..." from the input column names, then fit an
# ordinary least-squares model on the full (normalised) dataset.
input_form <- paste(input_column, collapse = "+")
formula <- as.formula(
  paste("Concrete.compressive.strength.MPa..megapascals.. ~ ", input_form)
)
mod <- lm(formula = formula, data = data)
summary(mod)
##
## Call:
## lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.71518 -0.37728 0.04205 0.39297 2.06192
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -8.666e-16 1.940e-02
## Cement..component.1..kg.in.a.m.3.mixture. 7.493e-01 5.311e-02
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. 5.363e-01 5.235e-02
## Fly.Ash..component.3..kg.in.a.m.3.mixture. 3.368e-01 4.821e-02
## Water...component.4..kg.in.a.m.3.mixture. -1.920e-01 5.137e-02
## Superplasticizer..component.5..kg.in.a.m.3.mixture. 1.042e-01 3.341e-02
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. 8.396e-02 4.373e-02
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. 9.670e-02 5.137e-02
## Age..day. 4.319e-01 2.052e-02
## t value Pr(>|t|)
## (Intercept) 0.000 1.000000
## Cement..component.1..kg.in.a.m.3.mixture. 14.110 < 2e-16 ***
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. 10.245 < 2e-16 ***
## Fly.Ash..component.3..kg.in.a.m.3.mixture. 6.987 5.07e-12 ***
## Water...component.4..kg.in.a.m.3.mixture. -3.739 0.000195 ***
## Superplasticizer..component.5..kg.in.a.m.3.mixture. 3.118 0.001870 **
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. 1.920 0.055122 .
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. 1.883 0.060044 .
## Age..day. 21.047 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6225 on 1021 degrees of freedom
## Multiple R-squared: 0.6155, Adjusted R-squared: 0.6125
## F-statistic: 204.3 on 8 and 1021 DF, p-value: < 2.2e-16
# Assumption 2 : The mean of residuals is zero -> Holds
mean(mod$residuals)
## [1] -2.90011e-17
# Assumption 3 : Homoscedasticity of residuals or equal variance -> Holds
par(mfrow=c(2,2))
plot(mod)

par(mfrow=c(1,1))
# Assumption 4 : No autocorrelation of residuals -> Holds
acf(mod$residuals)

# Assumption 5 : Input columns and residuals are uncorrelated -> Holds
# Print a correlation test between one column and the residuals of `mod`.
checkCorrelationWithMod <- function(X) {
  print(cor.test(X, mod$residuals))
}
# Test the input columns only: the target is correlated with the residuals
# of a least-squares fit by construction, so including it says nothing about
# this assumption. A for loop (instead of a bare lapply) also keeps every
# result from being echoed a second time as part of the returned list.
for (name in input_column) {
  cat("\n", name, "\n", sep = "")
  checkCorrelationWithMod(data[[name]])
}
##
## Pearson's product-moment correlation
##
## data: x and y
## t = -7.5441e-16, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## -2.35294e-17
##
##
## Pearson's product-moment correlation
##
## data: x and y
## t = -2.8522e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## -8.895645e-17
##
##
## Pearson's product-moment correlation
##
## data: x and y
## t = -1.8506e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## -5.772011e-17
##
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 1.1031e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## 3.440393e-17
##
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 2.658e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## 8.290149e-17
##
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 3.1017e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## 9.673911e-17
##
##
## Pearson's product-moment correlation
##
## data: x and y
## t = -6.0088e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## -1.874095e-16
##
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 2.7321e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## 8.521194e-17
##
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 25.341, df = 1028, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5809978 0.6563001
## sample estimates:
## cor
## 0.620075
## $Cement..component.1..kg.in.a.m.3.mixture.
##
## Pearson's product-moment correlation
##
## data: x and y
## t = -7.5441e-16, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## -2.35294e-17
##
##
## $Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.
##
## Pearson's product-moment correlation
##
## data: x and y
## t = -2.8522e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## -8.895645e-17
##
##
## $Fly.Ash..component.3..kg.in.a.m.3.mixture.
##
## Pearson's product-moment correlation
##
## data: x and y
## t = -1.8506e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## -5.772011e-17
##
##
## $Water...component.4..kg.in.a.m.3.mixture.
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 1.1031e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## 3.440393e-17
##
##
## $Superplasticizer..component.5..kg.in.a.m.3.mixture.
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 2.658e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## 8.290149e-17
##
##
## $Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 3.1017e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## 9.673911e-17
##
##
## $Fine.Aggregate..component.7..kg.in.a.m.3.mixture.
##
## Pearson's product-moment correlation
##
## data: x and y
## t = -6.0088e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## -1.874095e-16
##
##
## $Age..day.
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 2.7321e-15, df = 1028, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06108321 0.06108321
## sample estimates:
## cor
## 8.521194e-17
##
##
## $Concrete.compressive.strength.MPa..megapascals..
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 25.341, df = 1028, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5809978 0.6563001
## sample estimates:
## cor
## 0.620075
# Assumption 6 : The number of observations must be greater than number of Xs -> Holds
nrow(data) > ncol(data)
## [1] TRUE
# Assumption 7 : Variability of X -> Holds
# Print the variance of one column.
checkVariate <- function(X) {
  print(var(X))
}
# Label each variance with its column name and print it exactly once; a bare
# lapply() would echo the returned list and repeat every value.
for (name in names(data)) {
  cat(name, ": ", sep = "")
  checkVariate(data[[name]])
}
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## $Cement..component.1..kg.in.a.m.3.mixture.
## [1] 1
##
## $Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.
## [1] 1
##
## $Fly.Ash..component.3..kg.in.a.m.3.mixture.
## [1] 1
##
## $Water...component.4..kg.in.a.m.3.mixture.
## [1] 1
##
## $Superplasticizer..component.5..kg.in.a.m.3.mixture.
## [1] 1
##
## $Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.
## [1] 1
##
## $Fine.Aggregate..component.7..kg.in.a.m.3.mixture.
## [1] 1
##
## $Age..day.
## [1] 1
##
## $Concrete.compressive.strength.MPa..megapascals..
## [1] 1
# Assumption 8 : Check if X and Y have inverse relationship -> Does not hold for Age..day. column
# Scatter plot of each input column (x axis, labelled with its name) against
# the target column.
for(i in input_column) {
plot(data[[i]], data$Concrete.compressive.strength.MPa..megapascals.., xlab = i)
}








# Assumption 9 :-No perfect multicollinearity -> Does not hold
# NOTE(review): the variance inflation factors printed below are all finite
# (largest about 7.5), so the collinearity is not perfect; several values
# above 5 do point to moderate multicollinearity. Confirm which threshold
# the "Does not hold" verdict is based on.
vif(mod)
## Cement..component.1..kg.in.a.m.3.mixture.
## 7.489003
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture.
## 7.277091
## Fly.Ash..component.3..kg.in.a.m.3.mixture.
## 6.171400
## Water...component.4..kg.in.a.m.3.mixture.
## 7.006340
## Superplasticizer..component.5..kg.in.a.m.3.mixture.
## 2.963862
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture.
## 5.077042
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture.
## 7.006677
## Age..day.
## 1.118366
# Assumption 10:-Normality of residuals
par(mfrow=c(2,2))
plot(mod)

par(mfrow=c(1,1))
#
# 5. Split the whole dataset into training (80%) and testing (20%).
#
# Seed the RNG so the 80/20 split, and every metric computed from it, is
# reproducible from run to run.
set.seed(42)
n_obs <- nrow(data)
# floor() makes the training-set size an explicit whole number.
dt <- sort(sample(n_obs, floor(n_obs * 0.8)))
train <- data[dt, ]
# setdiff() instead of data[-dt, ]: a negative index built from an empty dt
# would select zero rows instead of all of them.
test <- data[setdiff(seq_len(n_obs), dt), ]
#
# 6. Design a predictive model for predicting the target attribute from training data.
#
# Fit a linear model of the target on all remaining columns of the training
# set, then report its residual standard error.
model <- lm(
  Concrete.compressive.strength.MPa..megapascals.. ~ .,
  data = train
)
sigma(model)
## [1] 0.6057782
# Coefficient table (estimate, std. error, t value, p-value). coef() is used
# instead of `$coef`, which only worked through partial matching of the
# `coefficients` element name.
coef(summary(model))
## Estimate Std. Error
## (Intercept) -0.006002755 0.02111245
## Cement..component.1..kg.in.a.m.3.mixture. 0.773844472 0.05739294
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. 0.562334081 0.05722500
## Fly.Ash..component.3..kg.in.a.m.3.mixture. 0.362659830 0.05185148
## Water...component.4..kg.in.a.m.3.mixture. -0.142887962 0.05445239
## Superplasticizer..component.5..kg.in.a.m.3.mixture. 0.124861170 0.03578007
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. 0.094514514 0.04743451
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. 0.132264387 0.05554560
## Age..day. 0.442730251 0.02271578
## t value Pr(>|t|)
## (Intercept) -0.284323 7.762351e-01
## Cement..component.1..kg.in.a.m.3.mixture. 13.483269 1.505214e-37
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. 9.826721 1.296304e-21
## Fly.Ash..component.3..kg.in.a.m.3.mixture. 6.994204 5.562499e-12
## Water...component.4..kg.in.a.m.3.mixture. -2.624090 8.850647e-03
## Superplasticizer..component.5..kg.in.a.m.3.mixture. 3.489685 5.095441e-04
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. 1.992527 4.664646e-02
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. 2.381186 1.748577e-02
## Age..day. 19.489986 9.639390e-70
#
# 7. Apply the designed model on test data.
#
# Predict the target for the held-out test rows.
pred <- predict(model, test)
# Full summary of the fitted training model.
summary(model)
##
## Call:
## lm(formula = Concrete.compressive.strength.MPa..megapascals.. ~
## ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.72646 -0.36095 0.03941 0.38011 1.99629
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -0.006003 0.021112
## Cement..component.1..kg.in.a.m.3.mixture. 0.773844 0.057393
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. 0.562334 0.057225
## Fly.Ash..component.3..kg.in.a.m.3.mixture. 0.362660 0.051851
## Water...component.4..kg.in.a.m.3.mixture. -0.142888 0.054452
## Superplasticizer..component.5..kg.in.a.m.3.mixture. 0.124861 0.035780
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. 0.094515 0.047435
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. 0.132264 0.055546
## Age..day. 0.442730 0.022716
## t value Pr(>|t|)
## (Intercept) -0.284 0.77624
## Cement..component.1..kg.in.a.m.3.mixture. 13.483 < 2e-16 ***
## Blast.Furnace.Slag..component.2..kg.in.a.m.3.mixture. 9.827 < 2e-16 ***
## Fly.Ash..component.3..kg.in.a.m.3.mixture. 6.994 5.56e-12 ***
## Water...component.4..kg.in.a.m.3.mixture. -2.624 0.00885 **
## Superplasticizer..component.5..kg.in.a.m.3.mixture. 3.490 0.00051 ***
## Coarse.Aggregate...component.6..kg.in.a.m.3.mixture. 1.993 0.04665 *
## Fine.Aggregate..component.7..kg.in.a.m.3.mixture. 2.381 0.01749 *
## Age..day. 19.490 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6058 on 815 degrees of freedom
## Multiple R-squared: 0.6371, Adjusted R-squared: 0.6335
## F-statistic: 178.8 on 8 and 815 DF, p-value: < 2.2e-16
# Observed and predicted target values for the test set, followed by the
# observed range for context when reading the error metrics.
actual <- test[[target_column]]
predicted <- pred
range(actual)
## [1] -1.854929 2.749476
#
# 8. Evaluate the designed model by means of RMSE or MAE.
#
# Errors are in standardized units because the target column was normalised
# earlier in the script. In the recorded run the test target ranged from
# about -1.85 to 2.75 and the RMSE was about 0.69.
rmse(actual, predicted)
## [1] 0.688342
mae(actual, predicted)
## [1] 0.5461329